Source code for hysop.backend.device.codegen.kernels.custom_symbolic

# Copyright (c) HySoP 2011-2024
# This file is part of HySoP software.
# See ""
# for further info.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import contextlib, math, operator, hashlib
from contextlib import contextmanager
import sympy as sm

from abc import ABCMeta, abstractmethod
from hysop import __VERBOSE__, __KERNEL_DEBUG__
from hysop.core.arrays.all import OpenClArray
from import nested
from hysop.constants import (

from import Utils, upper_pow2_or_3
from import check_instance, first_not_None, to_tuple
from import npw
from import is_complex
from hysop.fields.cartesian_discrete_field import CartesianDiscreteField

from hysop.numerics.remesh.remesh import RemeshKernel
from hysop.fields.continuous_field import Field
from hysop.fields.discrete_field import DiscreteScalarFieldView
from hysop.symbolic import space_symbols as symbolic_space_symbols
from hysop.symbolic import local_indices_symbols as symbolic_local_indices
from hysop.symbolic.array import (

from hysop.backend.device.opencl import cl, clTools, clCharacterize
from hysop.backend.device.opencl.opencl_env import OpenClEnvironment
from hysop.backend.device.opencl.opencl_types import OpenClTypeGen
from hysop.backend.device.opencl.opencl_array_backend import OpenClArrayBackend

from hysop.backend.device.codegen import CodeGeneratorWarning
from hysop.backend.device.codegen.base.utils import WriteOnceDict, ArgDict, SortedDict
from hysop.backend.device.codegen.base.statistics import WorkStatistics
from hysop.backend.device.codegen.base.variables import CodegenStruct
from hysop.backend.device.codegen.base.opencl_codegen import OpenClCodeGenerator
from hysop.backend.device.codegen.base.kernel_codegen import KernelCodeGenerator
from hysop.backend.device.codegen.base.variables import (
from hysop.backend.device.codegen.structs.mesh_info import (
from hysop.backend.device.codegen.symbolic.functions.custom_symbolic_function import (

from hysop.operator.base.custom_symbolic_operator import ValidExpressions
from hysop.symbolic.field import SymbolicDiscreteField
from hysop.symbolic.relational import Assignment
from hysop.symbolic.misc import TimeIntegrate

from hysop.backend.device.codegen.symbolic.expr import (

class SymbolicCodegenContext:
    """Store all information required to generate custom code."""

    def __init__(
        self,
        typegen,
        expr_info,
        ftype,
        itype,
        vectorization,
        granularity,
        kernel_dim,
        use_short_circuit,
        work_dim,
        known_vars,
        tuning_mode,
        debug_mode,
        symbolic_mode,
    ):
        """Record the generation settings, derive vector types and build all arguments.

        For each of the vectorization width, the granularity, the kernel
        dimension and the array dimension (kernel_dim + granularity) a
        float and an int vector type are requested from ``typegen``; the
        last three widths are first passed through ``upper_pow2_or_3``.
        The constructor finishes by calling ``compute_work_per_step()`` and
        ``generate_args()``.
        """
        self.expr_info, self.typegen = expr_info, typegen
        self.work_dim = work_dim
        self.kernel_dim, self.granularity = kernel_dim, granularity
        self.vectorization = vectorization
        self.itype, self.ftype = itype, ftype

        # Vector types for the vectorization width.
        self.vftype = typegen.vtype(ftype, vectorization)
        self.vitype = typegen.vtype(itype, vectorization)

        # Vector types for the granularity.
        self.vgranularity_dim = upper_pow2_or_3(granularity)
        self.gftype = typegen.vtype(ftype, self.vgranularity_dim)
        self.gitype = typegen.vtype(itype, self.vgranularity_dim)

        # Vector types for the kernel dimension.
        self.vkernel_dim = upper_pow2_or_3(kernel_dim)
        self.kftype = typegen.vtype(ftype, self.vkernel_dim)
        self.kitype = typegen.vtype(itype, self.vkernel_dim)

        # Vector types for the full array dimension.
        self.array_dim = kernel_dim + granularity
        self.varray_dim = upper_pow2_or_3(self.array_dim)
        self.aftype = typegen.vtype(ftype, self.varray_dim)
        self.aitype = typegen.vtype(itype, self.varray_dim)

        # Generation switches.
        self.use_short_circuit = use_short_circuit
        self.tuning_mode = tuning_mode
        self.debug_mode = debug_mode
        self.symbolic_mode = symbolic_mode
        self.known_vars = known_vars
        self.local_size_known = "local_size" in known_vars

        # Registries filled by the generate_*_args() methods.
        self.array_sizes = SortedDict()
        self.array_ghosts = SortedDict()
        self.array_contiguous_ghosts = SortedDict()
        self.buffer_args = SortedDict()

        self.compute_work_per_step()
        self.generate_args()

    def compute_work_per_step(self):
        """Compute the extra (vectorized) work required at each step.

        Sets ``self.extra_vwork_per_step``, an integer array of shape
        (nsteps, nlhsobjects), and ``self.max_extra_vwork``, the maximum of
        its first row (0 when there is no left-hand-side object).
        """
        info = self.expr_info
        step_count = info.nsteps
        lhs_count = info.nlhsobjects
        step_ghosts = info.min_ghosts_per_integration_step

        extra_work = npw.int_zeros(shape=(step_count, lhs_count))
        if step_count > 1:
            # All steps but the last receive the reversed leading ghost rows.
            extra_work[: step_count - 1] = step_ghosts[:-1][::-1]

        # Ceil-division of the extra work by the vectorization width.
        extra_vwork = extra_work + self.vectorization - 1
        extra_vwork //= self.vectorization

        self.extra_vwork_per_step = extra_vwork
        self.max_extra_vwork = npw.max(extra_vwork[0]) if lhs_count > 0 else 0

    def array_size(self, varname, index=None):
        """Return the size of a registered array as a string expression.

        The registered entry (optionally indexed by ``index``) is a triplet
        ``(Smem, A, B)``. When the local size is known, ``A`` and ``B`` must
        be zero and ``str(Smem)`` is returned; otherwise ``Smem`` must be
        zero and the expression ``"A*local_size+B"`` is returned.
        """
        assert varname in self.array_sizes, self.array_sizes.keys()
        entry = self.array_sizes[varname]
        if index is not None:
            entry = entry[index]
        static_size, per_item, constant = entry

        if not self.local_size_known:
            assert static_size == 0, f"Smem={static_size}"
            return f"{per_item}*{self.local_size}+{constant}"

        assert per_item == constant == 0, f"A={per_item}, B={constant}"
        assert static_size >= 0, f"Smem={static_size}"
        return str(static_size)

    def array_ghost(self, varname, index=None):
        """Return the ghosts registered for ``varname`` (optionally one entry of them)."""
        assert varname in self.array_ghosts, self.array_ghosts.keys()
        registered = self.array_ghosts[varname]
        return registered if index is None else registered[index]

    def generate_args(self):
        """Generate every argument group and merge them into ``self.args``."""
        self.common_args = self.generate_common_args()
        self.field_args = self.generate_field_args()
        self.array_args = self.generate_array_args()
        self.param_args = self.generate_param_args()
        self.scalar_args = self.generate_scalar_args()

        merged = ArgDict()
        for group in (
            self.common_args,
            self.field_args,
            self.array_args,
            self.param_args,
            self.scalar_args,
        ):
            merged.update(group)
        self.args = merged

    def generate_common_args(self):
        """Declare the arguments shared by all generated functions.

        Every declared argument is also exposed as an attribute of the
        context under its argument key. ``self.space_symbols``,
        ``self.local_indices_symbols`` and ``self.local_size`` are set as a
        side effect.
        """
        tg = self.typegen
        args = ArgDict()

        def scalar(key, ctype, varname=None, **extra):
            # Const scalar stored under `key`; named `varname` when given.
            args[key] = CodegenVariable(
                key if varname is None else varname, ctype, tg, const=True, **extra
            )

        for key in ("offset", "local_offset", "line_offset", "full_offset"):
            scalar(key, self.itype)
        scalar("last_offset", self.itype, nl=True)

        for key in ("is_first", "is_last", "is_active"):
            scalar(key, "bool")
        for key in ("is_first_active", "is_last_active", "is_active_boundary"):
            scalar(key, "bool", nl=True)

        scalar("lid", self.itype)
        scalar("local_work", self.itype, varname="lwork")
        scalar("current_local_work", self.itype, varname="clwork", nl=True)

        args["compute_grid_size"] = CodegenVectorClBuiltin(
            "compute_grid_size",
            self.itype,
            self.varray_dim,
            typegen=tg,
            value=self.expr_info.compute_resolution[::-1],
            const=True,
            nl=True,
        )
        scalar("dx", tg.fbtype)

        def declare_symbols(symbols, ctype):
            # The first symbol becomes a vector of width `vectorization`,
            # the following ones plain scalars.
            declared = SortedDict()
            for position, symbol in enumerate(symbols[: self.varray_dim]):
                if position == 0:
                    variable = CodegenVectorClBuiltin(
                        symbol.varname, ctype, self.vectorization, typegen=tg
                    )
                else:
                    variable = CodegenVariable(symbol.varname, ctype, tg)
                args[symbol.varname] = variable
                declared[symbol] = args[symbol.varname]
            return declared

        self.space_symbols = declare_symbols(symbolic_space_symbols, self.ftype)
        self.local_indices_symbols = declare_symbols(
            symbolic_local_indices, self.itype
        )

        if self.local_size_known:
            local_size = self.known_vars["local_size"][0]
        else:
            # Unknown at generation time: passed as the extra argument "L".
            local_size = CodegenVariable("L", self.itype, tg, const=True)
            args["L"] = local_size
        self.local_size = local_size

        for key, variable in args.items():
            setattr(self, key, variable)
        return args

    def generate_field_args(self):
        """Declare one argument per read or written discrete field component.

        Components with ghosts are declared as const restrict pointers with
        ``__local`` storage and their size triplet is recorded; components
        without ghosts are declared as const vectors of width
        ``vectorization``. Updates ``self.array_sizes``,
        ``self.array_ghosts`` and ``self.array_contiguous_ghosts``.
        """
        typegen = self.typegen
        info = self.expr_info
        min_ghosts = info.min_ghosts_per_components
        read_counter = info.discretization_info.read_counter
        write_counter = info.discretization_info.write_counter

        args = ArgDict()
        ghosts_by_name = SortedDict()
        contiguous_ghosts = SortedDict()
        sizes = SortedDict()

        all_dfields = {
            f
            for f in set(info.input_dfields.values()).union(
                info.output_dfields.values()
            )
        }
        for dfield in all_dfields:
            field = dfield._field
            ctype = dfield.ctype

            # Names that are already lowercase get an underscore prefix.
            base = dfield.var_name.lower()
            if base == dfield.var_name:
                base = "_" + base
            template = f"{base}_{{}}"

            reads = read_counter.get(dfield)
            writes = write_counter.get(dfield)
            component_sizes = sizes.setdefault(
                dfield, npw.int_zeros(shape=(dfield.nb_components, 3))
            )

            for index in range(dfield.nb_components):
                unread = (reads is None) or not (reads[index] > 0)
                unwritten = (writes is None) or not (writes[index] > 0)
                if unread and unwritten:
                    continue

                ghosts = min_ghosts[field][index]
                cname = template.format(index)
                ghosts_by_name[cname] = ghosts

                if not (ghosts > 0):
                    args[cname] = CodegenVectorClBuiltin(
                        cname,
                        ctype,
                        self.vectorization,
                        typegen=typegen,
                        const=True,
                        nl=True,
                    )
                    continue

                args[cname] = CodegenVariable(
                    name=cname,
                    typegen=typegen,
                    ctype=ctype,
                    ptr=True,
                    ptr_restrict=True,
                    const=True,
                    storage="__local",
                    add_impl_const=True,
                    nl=True,
                )
                # Size triplet, see array_size().
                if self.local_size_known:
                    component_sizes[index] = (
                        self.vectorization * self.local_size + 2 * ghosts,
                        0,
                        0,
                    )
                else:
                    component_sizes[index] = (0, self.vectorization, 2 * ghosts)

            contiguous_ghosts[dfield] = min_ghosts[field].copy()

        ghosts_by_name.update(contiguous_ghosts)

        self.array_sizes.update(sizes)
        self.array_ghosts.update(ghosts_by_name)
        self.array_contiguous_ghosts.update(contiguous_ghosts)
        return args

    def generate_array_args(self):
        """Declare one argument per read or written symbolic array.

        Arrays with ghosts are declared as const restrict pointers with
        ``__local`` storage and a size triplet; the others as const vectors
        of width ``vectorization`` with a recorded size of 0. Arrays missing
        from the ghost mapping of ``expr_info`` are inserted there with zero
        ghosts. Updates ``self.array_sizes``, ``self.array_ghosts`` and
        ``self.array_contiguous_ghosts``.
        """
        typegen = self.typegen
        info = self.expr_info
        min_ghosts = info.min_ghosts_per_components
        read_counter = info.discretization_info.read_counter
        write_counter = info.discretization_info.write_counter

        args = ArgDict()
        ghosts_by_name = SortedDict()
        contiguous_ghosts = SortedDict()
        sizes = SortedDict()

        all_arrays = set(info.input_arrays.values()).union(
            info.output_arrays.values()
        )
        for array in all_arrays:
            ctype = array.ctype

            # Names that are already lowercase get an underscore prefix.
            name = array.varname.lower()
            if name == array.varname:
                name = "_" + name

            read_count = read_counter.get(array, 0)
            write_count = write_counter.get(array, 0)
            ghosts = min_ghosts.setdefault(array, npw.asintarray([0]))
            has_ghosts = ghosts > 0
            assert (read_count > 0) or (write_count > 0)

            ghosts_by_name[name] = ghosts
            if has_ghosts:
                args[name] = CodegenVariable(
                    name=name,
                    typegen=typegen,
                    ctype=ctype,
                    ptr=True,
                    ptr_restrict=True,
                    const=True,
                    storage="__local",
                    add_impl_const=True,
                    nl=True,
                )
                # Size triplet, see array_size().
                if self.local_size_known:
                    size = (
                        self.vectorization * self.local_size + 2 * ghosts,
                        0,
                        0,
                    )
                else:
                    size = (0, self.vectorization, 2 * ghosts)
            else:
                args[name] = CodegenVectorClBuiltin(
                    name,
                    ctype,
                    self.vectorization,
                    typegen=typegen,
                    const=True,
                    nl=True,
                )
                size = 0

            sizes[array] = size
            contiguous_ghosts[array] = min_ghosts[array].copy()

        ghosts_by_name.update(contiguous_ghosts)

        self.array_sizes.update(sizes)
        self.array_ghosts.update(ghosts_by_name)
        self.array_contiguous_ghosts.update(contiguous_ghosts)
        return args

    def generate_param_args(self):
        """Declare one argument per input parameter.

        Raises
        ------
        NotImplementedError
            If ``expr_info`` has at least one output parameter.
        """
        typegen = self.typegen
        info = self.expr_info
        args = ArgDict()

        # READ ONLY PARAMETERS
        #  (ndim<=1) and (1<=size<=16) => simple vector constant
        #  (ndim>1) or (size>16)       => ptr (const) __constant memory space
        for pname, param in info.input_params.items():
            assert pname not in info.output_params
            shape = param.shape
            ctype = param.ctype
            ndim = len(shape)

            if ndim == 0:
                vsize = 1
            elif ndim == 1 and shape[0] <= 16:
                vsize = upper_pow2_or_3(shape[0])
            else:
                vsize = None

            if vsize is None:
                args[pname] = CodegenVariable(
                    name=pname,
                    typegen=typegen,
                    ctype=ctype,
                    ptr=True,
                    ptr_restrict=True,
                    const=True,
                    storage="__constant",
                    add_impl_const=True,
                    nl=True,
                )
            else:
                args[pname] = CodegenVectorClBuiltin(
                    pname, ctype, vsize, typegen=typegen, const=True, nl=True
                )

        # OUTPUT PARAMETERS
        #  not supported yet (should be non const __global ptrs).
        for _unused in info.output_params.items():
            raise NotImplementedError("Output parameters are not supported.")

        return args

    def generate_scalar_args(self):
        """Declare each scalar of ``expr_info`` as a const vector of width ``vectorization``."""
        typegen = self.typegen
        width = self.vectorization
        args = ArgDict()
        for sname, scalar in self.expr_info.scalars.items():
            args[sname] = CodegenVectorClBuiltin(
                sname, scalar.ctype, width, typegen=typegen, const=True, nl=True
            )
        return args
[docs] class CustomSymbolicKernelGenerator(KernelCodeGenerator, metaclass=ABCMeta):
@classmethod
def create(cls, expr_info, **kwds):
    """Kernel generator factory that handles different expression types.

    The concrete generator class is selected from ``expr_info.kind`` and
    instantiated with ``expr_info`` plus the remaining keyword arguments.
    A ``RuntimeError`` is raised for any other expression kind.
    """
    kind = expr_info.kind

    if kind == SymbolicExpressionKind.AFFECT:
        # Imported lazily, inside the branch that needs it.
        from hysop.backend.device.codegen.symbolic.kernels.custom_symbolic_affect import (
            CustomSymbolicAffectKernelGenerator,
        )

        return CustomSymbolicAffectKernelGenerator(expr_info=expr_info, **kwds)

    if kind == SymbolicExpressionKind.TIME_INTEGRATE:
        from hysop.backend.device.codegen.symbolic.kernels.custom_symbolic_time_integrate import (
            CustomSymbolicTimeIntegrateKernelGenerator,
        )

        return CustomSymbolicTimeIntegrateKernelGenerator(
            expr_info=expr_info, **kwds
        )

    raise RuntimeError(f"Expression kind {kind} is not supported yet.")
@abstractmethod
def custom_name(cls):
    """Abstract hook that subclasses must override.

    The base implementation does nothing and returns None.
    """
@abstractmethod
def generate_expr_code(self):
    """Abstract hook that subclasses must override.

    The base implementation does nothing and returns None.
    """
@classmethod
def codegen_name(
    cls,
    work_dim,
    array_dim,
    kernel_dim,
    granularity,
    ftype,
    vectorization,
    name,
    direction,
):
    """Build the kernel name from its generation parameters.

    The result has the form
    ``<name>__<array_dim>d_kdim<kernel_dim>_wdim<work_dim>_gr<granularity>__<ftype>_v<vectorization>``.
    ``direction`` is accepted but is not part of the generated name.
    """
    dims = f"{array_dim}d_kdim{kernel_dim}_wdim{work_dim}_gr{granularity}"
    return f"{name}__{dims}__{ftype}_v{vectorization}"
def __init__(
    self,
    typegen,
    expr_info,
    ftype,
    kernel_dim,
    work_dim,
    granularity,
    vectorization,
    itype="int",
    use_short_circuit=None,
    symbolic_mode=False,
    debug_mode=False,
    tuning_mode=False,
    known_vars=None,
):
    """Build the codegen context and kernel arguments, then generate the code.

    Parameters
    ----------
    vectorization:
        Must be one of 1, 2, 4, 8 or 16.
    use_short_circuit:
        Defaults to ``typegen.use_short_circuit_ops`` when None.
    known_vars:
        Defaults to an empty dict when None.

    The remaining parameters are forwarded unchanged to
    ``SymbolicCodegenContext`` and to the base class constructor.
    """
    assert vectorization in [1, 2, 4, 8, 16]

    use_short_circuit = first_not_None(
        use_short_circuit, typegen.use_short_circuit_ops
    )
    known_vars = first_not_None(known_vars, {})

    csc = SymbolicCodegenContext(
        typegen,
        expr_info,
        ftype,
        itype,
        vectorization,
        granularity,
        kernel_dim,
        use_short_circuit,
        work_dim,
        known_vars,
        tuning_mode,
        debug_mode,
        symbolic_mode,
    )

    # NOTE(review): the `name` argument was missing from the listing this
    # was recovered from (`csc.vectorization,, expr_info.direction`);
    # `expr_info.name` is the presumed original value - confirm upstream.
    name = self.codegen_name(
        work_dim,
        csc.array_dim,
        csc.kernel_dim,
        csc.granularity,
        csc.ftype,
        csc.vectorization,
        expr_info.name,
        expr_info.direction,
    )

    # Requirements and arguments are computed before the base class is
    # initialized, because its constructor takes the kernel arguments.
    kernel_reqs = self.build_requirements(csc)
    kernel_args = self.gen_kernel_arguments(csc, kernel_reqs)
    expr_reqs = self.build_expr_requirements(
        csc, kernel_reqs, kernel_args, known_vars
    )
    kernel_reqs.update(expr_reqs)

    super().__init__(
        name=name,
        typegen=typegen,
        work_dim=work_dim,
        kernel_args=kernel_args,
        known_vars=known_vars,
        vec_type_hint=ftype,
        symbolic_mode=symbolic_mode,
    )

    self.update_requirements(kernel_reqs)

    # NOTE(review): the assignment target was missing from the listing
    # (`... = csc`); `self.csc` is the presumed attribute - confirm upstream.
    self.csc = csc

    self.gencode()
def build_requirements(self, csc):
    """Return the struct requirements of the kernel.

    Two entries are registered, ``"MeshBaseStruct"`` and
    ``"MeshInfoStruct"``, both built with the context typegen and a vector
    size of ``csc.varray_dim``.
    """
    typegen, vsize = csc.typegen, csc.varray_dim
    requirements = WriteOnceDict()

    # discrete cartesian fields mesh info
    requirements["MeshBaseStruct"] = MeshBaseStruct(typegen=typegen, vsize=vsize)
    requirements["MeshInfoStruct"] = MeshInfoStruct(typegen=typegen, vsize=vsize)

    return requirements
@abstractmethod
def build_expr_requirements(self, csc, kernel_reqs, kernel_args, known_vars=None):
    """Generate requirements and generate new expressions.

    ``known_vars`` is optional so that this base signature accepts both
    the four-argument call made by ``__init__`` and three-argument calls.
    The base implementation ignores all of its arguments and returns an
    empty ``WriteOnceDict``.
    """
    return WriteOnceDict()
def required_workgroup_cache_size(self, local_work_size):
    """
    Return a tuple of required (static,dynamic,total) cache bytes per workgroup

    ``self.local_mem_size`` is read as a triplet: its first entry is the
    static size, and the dynamic size is its second entry times
    ``local_work_size[0]`` plus its third entry.

    Raises
    ------
    NotImplementedError
        If the dynamic size is strictly positive.
    """
    local_mem_size = self.local_mem_size
    local_work_size = npw.asarray(local_work_size)

    static_size = local_mem_size[0]
    dynamic_size = local_mem_size[1] * local_work_size[0] + local_mem_size[2]

    if dynamic_size > 0:
        msg = "Dynamic cache has not been implemented yet, "
        msg += "please specify local_work_size in known_vars."
        raise NotImplementedError(msg)

    return (static_size, dynamic_size, static_size + dynamic_size)
def gen_kernel_arguments(self, csc, kernel_reqs):
    """Declare all kernel arguments and return them as an ``ArgDict``.

    Arguments are declared in this order: read-only inputs, outputs,
    parameters, the optional granularity index ``gidx``. An object (or
    field component) that is both read and written is declared once, by
    the output pass, as a non-const argument.

    Side effects: sets ``self.field_mesh_infos``, ``self.array_args``,
    ``self.array_strides``, ``self.param_args``, ``self.gidx`` and
    ``self.lmem``, and registers buffer arguments in ``csc.buffer_args``.

    Raises
    ------
    TypeError
        If a counter key is neither an indexed nor a simple counter type.
    NotImplementedError
        If the local size is not known (dynamic local memory).
    """
    expr_info = csc.expr_info
    typegen = csc.typegen
    di = expr_info.discretization_info

    buffer_types = (OpenClSymbolicBuffer, OpenClSymbolicNdBuffer)
    simple_types = (OpenClSymbolicArray,) + buffer_types

    kargs = ArgDict()

    # declare all array like arguments
    mesh_infos = SortedDict()
    param_args = SortedDict()
    array_args = SortedDict()
    array_strides = SortedDict()

    def declare_mesh_info(dfield):
        # One const mesh info struct argument per discrete field.
        mesh_info_name = f"{dfield.var_name}_mesh_info"
        mesh_info = kernel_reqs["MeshInfoStruct"].build_codegen_variable(
            const=True, name=mesh_info_name
        )
        mesh_infos[dfield] = mesh_info_name
        kargs[mesh_info_name] = mesh_info

    def declare_array(vname, ctype, const):
        # Return (arg, strides) as built by the array backend for `vname`.
        return OpenClArrayBackend.build_codegen_arguments(
            kargs,
            name=vname,
            known_vars=csc.known_vars,
            symbolic_mode=csc.symbolic_mode,
            storage=self._global,
            ctype=ctype,
            typegen=typegen,
            mesh_dim=csc.varray_dim,
            ptr_restrict=True,
            const=const,
            volatile=vname in expr_info.is_volatile,
        )

    def register_simple(obj, arg, stride):
        # Buffers are tracked on the context, arrays like 1-component fields.
        if isinstance(obj, buffer_types):
            csc.buffer_args[obj] = arg
        else:
            array_args[obj] = {0: arg}
            array_strides[obj] = {0: stride}

    def declare_components(dfield, counts, const, skip_written):
        args = array_args.setdefault(dfield, {})
        strides = array_strides.setdefault(dfield, {})
        for i, count in enumerate(counts):
            if count == 0:
                continue
            if (
                skip_written
                and (dfield in di.write_counter)
                and di.write_counter[dfield][i] > 0
            ):
                # Declared later, non-const, by the output pass.
                continue
            vname = dfield.var_name + "_" + str(i)
            arg, stride = declare_array(vname, dfield.ctype, const)
            assert i not in args
            assert i not in strides
            args[i] = arg
            strides[i] = stride

    # read-only input fields
    for obj, counts in di.read_counter.items():
        assert counts is not None
        if npw.array_equal(counts, 0):
            continue
        if isinstance(obj, di.IndexedCounterTypes):
            assert isinstance(obj, DiscreteScalarFieldView)
            assert obj not in mesh_infos
            declare_mesh_info(obj)
            declare_components(obj, counts, const=True, skip_written=True)
        elif isinstance(obj, di.SimpleCounterTypes):
            assert isinstance(obj, simple_types), type(obj)
            assert counts > 0
            if (obj in di.write_counter) and (di.write_counter[obj] > 0):
                # Declared later, non-const, by the output pass.
                continue
            arg, stride = declare_array(obj.varname, obj.ctype, const=True)
            register_simple(obj, arg, stride)
        else:
            msg = f"Unsupported type {type(obj)}."
            raise TypeError(msg)

    # output fields
    for obj, counts in di.write_counter.items():
        assert counts is not None
        if npw.array_equal(counts, 0):
            continue
        if isinstance(obj, di.IndexedCounterTypes):
            assert isinstance(obj, DiscreteScalarFieldView)
            if obj not in mesh_infos:
                declare_mesh_info(obj)
            declare_components(obj, counts, const=False, skip_written=False)
        elif isinstance(obj, di.SimpleCounterTypes):
            assert isinstance(obj, simple_types), type(obj)
            assert counts > 0
            arg, stride = declare_array(obj.varname, obj.ctype, const=False)
            register_simple(obj, arg, stride)
        else:
            msg = f"Unsupported type {type(obj)}."
            raise TypeError(msg)

    # parameters
    for argname, arg in csc.param_args.items():
        param_args[argname] = arg
        kargs[argname] = arg

    # granularity
    if csc.granularity > 0:
        gidx = CodegenVectorClBuiltin(
            "gidx", "int", csc.vgranularity_dim, typegen=typegen, const=True
        )
        kargs["gidx"] = gidx
    else:
        gidx = None

    # cache
    if not csc.local_size_known:
        # The buffer argument is still declared first, as before, but this
        # path always ends in the NotImplementedError below.
        lmem = CodegenVariable(
            storage=self._local,
            ctype="uchar",
            add_impl_const=True,
            name="buffer",
            ptr=True,
            ptr_restrict=True,
            typegen=typegen,
            nl=False,
        )
        kargs["buffer"] = lmem
        msg = "Cannot handle dynamic local memory yet, "
        msg += "please specify local work group size as a known_vars."
        raise NotImplementedError(msg)
    lmem = None

    self.field_mesh_infos = mesh_infos
    self.array_args = array_args
    self.array_strides = array_strides
    self.param_args = param_args
    self.gidx = gidx
    self.lmem = lmem

    return kargs
def _generate_common_variables(self):
    """Build the codegen variables shared by the whole kernel body.

    Creates the loop index vector ``vid``, the constants ``n`` (vectorization)
    and ``extra_vwork``, the constant vectors ``vzero``, ``voffset`` and
    ``azero``, sets the initializer of ``csc.local_work`` and stores all of
    them as attributes of ``self``.

    Returns the tuple ``(compute_grid_size, loop_id, vectorization_var,
    max_extra_vwork_var, local_work, vzero, voffset, azero)``.
    """
    tg = self.typegen
    # NOTE(review): the right-hand side of this assignment was lost in the
    # extracted listing; `self.csc` is the presumed original -- confirm.
    csc = self.csc
    itype = csc.itype
    varray_dim = csc.varray_dim
    vectorization = csc.vectorization
    local_size = self.vars["local_size"]

    loop_id = CodegenVectorClBuiltin("vid", itype, varray_dim, typegen=tg)
    vectorization_var = CodegenVariable(
        "n", itype, tg, const=True, value=vectorization
    )
    local_work = csc.local_work
    max_extra_vwork_var = CodegenVariable(
        "extra_vwork",
        csc.itype,
        typegen=tg,
        const=True,
        value=csc.max_extra_vwork,
    )
    # The initializer is an expression in terms of the generated variables.
    local_work.init = "{}*({}-2*{})".format(
        vectorization_var, local_size[0], max_extra_vwork_var
    )

    vzero = CodegenVectorClBuiltin(
        "vzero",
        itype,
        vectorization,
        typegen=tg,
        const=True,
        value=npw.zeros(vectorization),
    )
    voffset = CodegenVectorClBuiltin(
        "voffset",
        itype,
        vectorization,
        typegen=tg,
        const=True,
        value=npw.arange(vectorization),
    )
    azero = CodegenVectorClBuiltin(
        "azero",
        itype,
        varray_dim,
        typegen=tg,
        const=True,
        value=npw.zeros(varray_dim),
    )
    compute_grid_size = csc.compute_grid_size

    self.loop_id = loop_id
    self.vectorization_var = vectorization_var
    self.local_work = local_work
    self.vzero = vzero
    self.voffset = voffset
    self.azero = azero
    self.max_extra_vwork_var = max_extra_vwork_var

    return (
        compute_grid_size,
        loop_id,
        vectorization_var,
        max_extra_vwork_var,
        local_work,
        vzero,
        voffset,
        azero,
    )


def _generate_mesh_variables(self):
    """Alias ``dx``, ``inv_dx`` and ``xmin`` of the first registered mesh info.

    When no field mesh info is registered the three aliases are ``None``.
    The aliases are stored on ``self`` and returned as
    ``(declare_mesh_properties, xmin, dx, inv_dx)``.

    The returned flag is always ``False``: the original code overwrites it
    unconditionally before returning, and that behaviour is kept here.
    """
    field_mesh_infos = self.field_mesh_infos
    if field_mesh_infos:
        mesh_info_0 = next(iter(field_mesh_infos.values()))
        dx = mesh_info_0["dx"].alias("dx", const=True)
        inv_dx = mesh_info_0["inv_dx"].alias("inv_dx", const=True)
        xmin = mesh_info_0["local_mesh"]["xmin"].alias("xmin", const=True)
    else:
        xmin, dx, inv_dx = None, None, None
    declare_mesh_properties = False

    self.dx = dx
    self.inv_dx = inv_dx
    self.xmin = xmin
    return declare_mesh_properties, xmin, dx, inv_dx


def _build_component_storage(self, array, valname, ghosts, is_rw, local_mem_size):
    """Create the kernel-side storage variable(s) of one array component.

    * ``ghosts == 0``: a ``__private`` vector of width ``csc.vectorization``.
    * ``ghosts != 0`` and the local size is known: a local array of
      ``vectorization * local_size[0] + 2 * ghosts`` elements, plus, when the
      component is both read and written (``is_rw``), a second local array
      of ``2 * ghosts`` elements named ``valname + "_r"``.

    ``local_mem_size`` is updated in place: slot 0 accumulates the byte size
    of every local array created here.

    Returns ``(var, rvar, local_size_per_index)`` where ``rvar`` is ``None``
    unless the second local array was created.

    Raises NotImplementedError when ghosts are required but the local work
    size is not known (dynamic local memory). The original draft of that
    branch referenced undefined names and could only fail with a NameError
    before reaching this very exception.
    """
    tg = self.typegen
    csc = self.csc

    if ghosts == 0:
        var = CodegenVectorClBuiltin(
            valname,
            array.ctype,
            csc.vectorization,
            typegen=tg,
            storage="__private",
        )
        return var, None, (0, 0, 0)

    if not csc.local_size_known:
        msg = "Cannot handle offset to different types yet "
        msg += "(need to consider alignment)."
        raise NotImplementedError(msg)

    L = self.known_vars["local_size"]
    S = csc.vectorization * L[0] + 2 * ghosts
    itemsize = array.dtype.itemsize
    var = CodegenArray(
        valname,
        dim=1,
        ctype=array.ctype,
        typegen=tg,
        shape=(S,),
        storage=self._local,
    )
    local_mem_size[0] += S * itemsize
    if is_rw:
        rvar = CodegenArray(
            valname + "_r",
            dim=1,
            ctype=array.ctype,
            typegen=tg,
            shape=(2 * ghosts,),
            storage=self._local,
        )
        local_mem_size[0] += 2 * ghosts * itemsize
    else:
        rvar = None
    return var, rvar, (S, 0, 0)


def _generate_array_variables(self):
    """Create per-array index, pointer and storage variables.

    For every entry of ``self.array_args`` this builds the array index
    vector (``<name>_vid``), aliases for its grid size and ghosts, and for
    each component a global offset (``..._gid``), a line pointer
    (``line_...``) and a private or local storage variable (see
    ``_build_component_storage``). It also records whether any private or
    local load/store is required.

    All resulting dictionaries and flags are stored on ``self``.

    Returns ``(array_gids, array_vids, array_values, array_grid_sizes,
    array_grid_ghosts, array_local_data, array_local_rdata,
    array_private_data)``.

    Raises TypeError for an array that is neither an ``OpenClSymbolicArray``
    nor a ``CartesianDiscreteField``.
    """
    array_args = self.array_args
    field_mesh_infos = self.field_mesh_infos
    tg = self.typegen
    # NOTE(review): `self.csc` restored (token lost in extraction) -- confirm.
    csc = self.csc
    di = csc.expr_info.discretization_info
    varray_dim = csc.varray_dim
    compute_grid_size = csc.compute_grid_size
    itype = csc.itype
    azero = self.azero

    array_gids = SortedDict()
    array_vids = SortedDict()
    array_grid_ghosts = SortedDict()
    array_grid_sizes = SortedDict()
    array_line_data = SortedDict()
    array_local_data = SortedDict()
    array_local_rdata = SortedDict()
    array_private_data = SortedDict()
    array_values = SortedDict()
    local_size_per_field = SortedDict()
    local_mem_size = npw.int_zeros(shape=(3,))

    has_private_loads, has_private_stores = False, False
    has_local_loads, has_local_stores = False, False

    for array, array_data in array_args.items():
        if isinstance(array, OpenClSymbolicArray):
            name = array.varname
        elif isinstance(array, DiscreteScalarFieldView):
            name = array.var_name
        else:
            # NOTE(review): right-hand side lost in extraction;
            # `array.name` is the presumed original -- confirm.
            name = array.name

        vindex = CodegenVectorClBuiltin(
            name + "_vid", itype, varray_dim, typegen=tg
        )
        write_counts = di.write_counter.get(array, None)
        read_counts = di.read_counter.get(array, None)

        grid_size_varname = name + "_grid_size"
        ghosts_varname = name + "_ghosts"
        if array in field_mesh_infos:
            # array is a discrete cartesian field (with potentially some ghosts)
            mesh_info = field_mesh_infos[array]
            grid_size = mesh_info["local_mesh"]["resolution"].alias(
                grid_size_varname
            )
            grid_ghosts = mesh_info["ghosts"].alias(ghosts_varname)
        else:
            # array is a numpy like array (without ghosts)
            grid_size = compute_grid_size.alias(grid_size_varname)
            grid_ghosts = azero.alias(ghosts_varname)

        indexed_line_data = array_line_data.setdefault(array, {})
        indexed_gid = array_gids.setdefault(array, {})
        indexed_local_data = array_local_data.setdefault(array, {})
        indexed_local_rdata = array_local_rdata.setdefault(array, {})
        indexed_private_data = array_private_data.setdefault(array, {})
        indexed_values = array_values.setdefault(array, {})

        # Describe every component as
        # (key, name suffix, global data, read count, write count, ghosts).
        if isinstance(array, OpenClSymbolicArray):
            # NOTE(review): `csc.array_contiguous_ghosts` restored (token
            # lost in extraction; same lookup as in load_data) -- confirm.
            array_ghosts = csc.array_contiguous_ghosts[array]
            local_size_per_index = None
            components = [
                (0, "", array_data[0], read_counts, write_counts, array_ghosts)
            ]
        elif isinstance(array, CartesianDiscreteField):
            array_ghosts = csc.array_contiguous_ghosts[array]
            local_size_per_index = local_size_per_field.setdefault(
                array, npw.int_zeros(shape=(array.nb_components, 3))
            )
            components = [
                (
                    i,
                    f"_{i}",
                    data,
                    None if read_counts is None else read_counts[i],
                    None if write_counts is None else write_counts[i],
                    array_ghosts[i],
                )
                for i, data in array_data.items()
            ]
        else:
            msg = f"Unsupported array type {type(array)}."
            raise TypeError(msg)

        # Value variables use the lowercase name; an underscore is prepended
        # when the name is already lowercase.
        base_valname = name.lower()
        if base_valname == name:
            base_valname = f"_{base_valname}"

        for key, suffix, data, rcount, wcount, ghosts in components:
            is_read = (rcount is not None) and (rcount > 0)
            is_written = (wcount is not None) and (wcount > 0)
            is_rw = is_read and is_written

            gindex = CodegenVariable(
                f"{name}{suffix}_gid", "ptrdiff_t", tg, const=True
            )
            line_data = data.newvar(
                f"line_{name}{suffix}", init=f"{data} $+ {gindex}"
            )
            var, rvar, size_triple = self._build_component_storage(
                array, base_valname + suffix, ghosts, is_rw, local_mem_size
            )
            if local_size_per_index is not None:
                local_size_per_index[key] = size_triple

            indexed_gid[key] = gindex
            indexed_line_data[key] = line_data
            indexed_values[key] = var
            if ghosts == 0:
                indexed_private_data[key] = var
                has_private_loads |= is_read
                has_private_stores |= is_written
            else:
                indexed_local_data[key] = var
                indexed_local_rdata[key] = rvar
                has_local_loads |= is_read
                has_local_stores |= is_written

        # Drop the per-array dictionaries that ended up empty.
        if not indexed_local_data:
            array_local_data.pop(array)
        if not indexed_local_rdata:
            array_local_rdata.pop(array)
        if not indexed_private_data:
            array_private_data.pop(array)

        array_vids[array] = vindex
        array_grid_sizes[array] = grid_size
        array_grid_ghosts[array] = grid_ghosts

    self.array_vids = array_vids
    self.array_gids = array_gids
    self.array_line_data = array_line_data
    self.array_grid_sizes = array_grid_sizes
    self.array_grid_ghosts = array_grid_ghosts
    self.array_values = array_values
    self.array_local_data = array_local_data
    self.array_local_rdata = array_local_rdata
    self.array_private_data = array_private_data
    self.local_size_per_field = local_size_per_field
    self.local_mem_size = local_mem_size
    self.has_private_loads = has_private_loads
    self.has_private_stores = has_private_stores
    self.has_local_loads = has_local_loads
    self.has_local_stores = has_local_stores

    return (
        array_gids,
        array_vids,
        array_values,
        array_grid_sizes,
        array_grid_ghosts,
        array_local_data,
        array_local_rdata,
        array_private_data,
    )


def _generate_inner_loop_variables(self):
    """Gather the variables used by the innermost loop.

    Creates ``k`` and ``kmax`` (``kmax`` is initialized to
    ``ceil(compute_grid_size[0] / local_work)`` written as an integer
    expression), stores ``k``, ``kmax`` and ``current_local_work`` on
    ``self`` and fetches the remaining variables from ``self.csc``.

    Returns ``(local_offset, line_offset, last_offset, full_offset, k, kmax,
    current_local_work, is_first, is_last, is_active, is_first_active,
    is_last_active, is_active_boundary)``.
    """
    tg = self.typegen
    # NOTE(review): `self.csc` restored (token lost in extraction) -- confirm.
    csc = self.csc
    itype = csc.itype
    local_work = self.local_work

    current_local_work = csc.current_local_work
    self.current_local_work = current_local_work

    k = CodegenVariable("k", itype, tg)
    kmax = CodegenVariable(
        "kmax",
        itype,
        tg,
        const=True,
        init="(({}+{lwork}-1)/{lwork})".format(
            csc.compute_grid_size[0], lwork=local_work
        ),
    )
    self.k = k
    self.kmax = kmax

    return (
        csc.local_offset,
        csc.line_offset,
        csc.last_offset,
        csc.full_offset,
        k,
        kmax,
        current_local_work,
        csc.is_first,
        csc.is_last,
        csc.is_active,
        csc.is_first_active,
        csc.is_last_active,
        csc.is_active_boundary,
    )


def _generate_loop_context(self):
    """Emit loop-independent declarations and build the nested loop contexts.

    Declares the index variables, handles the ``granularity`` leading
    dimensions (fixed from ``gidx``), then builds one context manager per
    kernel dimension. Entering the context of dimension ``i`` opens the
    corresponding ``for`` loop and emits the index, coordinate and (for
    ``i == 0``) offset / line pointer code at the top of its body.

    Returns the list of context managers ordered from dimension
    ``kdim - 1`` down to ``0``; they are meant to be entered nested, in
    that order.
    """
    # NOTE(review): `self.csc` restored (token lost in extraction) -- confirm.
    csc = self.csc
    itype = csc.itype
    array_dim = csc.array_dim
    compute_grid_size = csc.compute_grid_size
    kdim = csc.kernel_dim
    wdim = csc.work_dim
    granularity = csc.granularity

    loop_id = self.loop_id
    array_vids = self.array_vids
    array_gids = self.array_gids
    array_ghosts = self.array_grid_ghosts
    array_strides = self.array_strides

    local_id = self.vars["local_id"]
    global_id = self.vars["global_id"]
    local_size = self.vars["local_size"]
    global_size = self.vars["global_size"]

    local_work = self.local_work
    vectorization_var = self.vectorization_var
    gidx = self.gidx

    (
        local_offset,
        line_offset,
        last_offset,
        full_offset,
        k,
        kmax,
        current_local_work,
        is_first,
        is_last,
        is_active,
        is_first_active,
        is_last_active,
        is_active_boundary,
    ) = self._generate_inner_loop_variables()

    def declare_kmax_and_last_offset():
        # kmax must be declared before last_offset, which refers to it.
        kmax.declare(self)
        last_offset.declare(
            self,
            init="{} - {}*({}-1)*{}".format(
                compute_grid_size[0], vectorization_var, kmax, local_size[0]
            ),
        )

    def first_field_entry():
        # First (array, vid) pair whose array has an associated mesh info.
        return next(
            iter(
                filter(
                    lambda kv: kv[0] in self.field_mesh_infos,
                    array_vids.items(),
                )
            )
        )

    def append_space_coordinate(i):
        # Physical coordinate along axis i: xmin + index * dx.
        arr, vid = first_field_entry()
        fmi = self.field_mesh_infos[arr]
        xi = csc.space_symbols[symbolic_space_symbols[i]]
        self.append(
            "{xi} = {x0} + {vid}*{dx};".format(
                xi=xi,
                vid=vid[i],
                x0=fmi["local_mesh"]["xmin"][i],
                dx=fmi["dx"][i],
            )
        )

    if self.work_dim == 1:
        declare_kmax_and_last_offset()

    if granularity > 0:
        self.jumpline()

    self.decl_vars(loop_id, *array_vids.values())

    if self.field_mesh_infos:
        self.decl_vars(csc.space_symbols[symbolic_space_symbols[0]])
        if csc.array_dim > 1:
            self.decl_vars(
                *(
                    csc.space_symbols[symbolic_space_symbols[i]]
                    for i in range(1, csc.array_dim)
                )
            )

    self.decl_vars(csc.local_indices_symbols[symbolic_local_indices[0]])
    if csc.array_dim > 1:
        self.decl_vars(
            *(
                csc.local_indices_symbols[symbolic_local_indices[i]]
                for i in range(1, csc.array_dim)
            )
        )

    if granularity > 0:
        # The `granularity` dimensions starting at kdim are not iterated:
        # their indices come straight from the gidx kernel argument.
        code = f"{loop_id[kdim:kdim+granularity]} = {gidx[:granularity]};"
        self.append(code)
        with self._align_() as al:
            for array, array_vid in array_vids.items():
                ghosts = array_ghosts[array]
                code = "{} $= {} $+ {};".format(
                    array_vid[kdim : kdim + granularity],
                    loop_id[kdim : kdim + granularity],
                    ghosts[kdim : kdim + granularity],
                )
                al.append(code)
        for i in range(kdim, kdim + granularity):
            idx_i = csc.local_indices_symbols[symbolic_local_indices[i]]
            idx_i.affect(self, init=loop_id[i])
        if self.field_mesh_infos:
            # Use an array that actually has a mesh info (the first array is
            # not guaranteed to), as done inside the loops below.
            for i in range(kdim, kdim + granularity):
                append_space_coordinate(i)
    self.jumpline()

    @contextlib.contextmanager
    def work_iterate(i):
        if i > 0:
            # Outer dimensions are strided over the global work size when
            # they are mapped to a work dimension, else iterated serially.
            j0 = global_id[i] if (i < wdim) else "0"
            gsize = global_size[i] if (i < wdim) else "1"
            j = loop_id[i]
            N = compute_grid_size[i]
            decl = ""
            unroll = False
        else:
            # Innermost dimension: iterate over kmax chunks of the line.
            j0 = "0"
            gsize = "1"
            j = k
            N = kmax
            decl = f"{itype} "
            unroll = not csc.tuning_mode

        header = "{decl}{j}={j0}; {j}<{N}; {j}+={gsize}".format(
            decl=decl, j=j, j0=j0, gsize=gsize, N=N
        )
        with self._for_(header, unroll=unroll) as ctx:
            if i > 0:
                with self._align_() as al:
                    al.jumpline()
                    for vid, ghosts in zip(
                        array_vids.values(), array_ghosts.values()
                    ):
                        al.append(
                            "{} $= {} $+ {};".format(
                                vid[i], loop_id[i], ghosts[i]
                            )
                        )
                    al.jumpline()
                idx_i = csc.local_indices_symbols[symbolic_local_indices[i]]
                idx_i.affect(self, init=loop_id[i])
                if self.field_mesh_infos:
                    append_space_coordinate(i)

            if i == 1:
                declare_kmax_and_last_offset()
            elif i == 0:
                with self._align_() as al:
                    line_offset.declare(
                        al,
                        align=True,
                        const=True,
                        init="{}*{}".format(k, local_work),
                    )
                    local_offset.declare(
                        al,
                        align=True,
                        const=True,
                        init="{}*({}-{})".format(
                            vectorization_var,
                            local_id[0],
                            self.max_extra_vwork_var,
                        ),
                    )
                    full_offset.declare(
                        al,
                        align=True,
                        const=True,
                        init="{}+{}".format(line_offset, local_offset),
                    )
                self.jumpline()

                with self._align_() as al:
                    al.append(f"{loop_id[0]} $= {line_offset};")
                    for vid, ghosts in zip(
                        array_vids.values(), array_ghosts.values()
                    ):
                        al.append(f"{vid[0]} $= {loop_id[0]} + {ghosts[0]};")
                    idx_i = csc.local_indices_symbols[symbolic_local_indices[0]]
                    idx_i.affect(
                        al, init=f"{full_offset}+{self.voffset}", align=True
                    )

                if self.field_mesh_infos:
                    arr, vid = first_field_entry()
                    fmi = self.field_mesh_infos[arr]
                    xi = csc.space_symbols[symbolic_space_symbols[i]]
                    code = "{xi} = {x0} + convert_{vftype}({vid}+{voffset}+{lo})*{dx};"
                    code = code.format(
                        xi=xi,
                        vid=vid[i],
                        lo=local_offset,
                        voffset=self.voffset,
                        vftype=csc.vftype,
                        x0=fmi["local_mesh"]["xmin"][i],
                        dx=fmi["dx"][i],
                    )
                    self.append(code)

                with self._align_() as al:
                    is_first.declare(al, align=True, init=f"({k}==0)")
                    is_last.declare(al, align=True, init=f"({k}=={kmax}-1)")
                    init = (
                        "({fo} >= -{n}*{evwork}) && ({fo} < {S}+{n}*{evwork})"
                    )
                    init = init.format(
                        fo=full_offset,
                        n=self.vectorization_var,
                        S=compute_grid_size[0],
                        evwork=self.max_extra_vwork_var,
                    )
                    is_active.declare(al, init=init, align=True)
                    init = "{} && ({} < 0)".format(is_active, full_offset)
                    is_first_active.declare(al, init=init, align=True)
                    init = "{} && ({}+{} > {})".format(
                        is_active,
                        full_offset,
                        vectorization_var,
                        compute_grid_size[0],
                    )
                    is_last_active.declare(al, init=init, align=True)
                    init = f"({is_first_active} || {is_last_active})"
                    is_active_boundary.declare(al, init=init, align=True)
                    current_local_work.declare(
                        al,
                        align=True,
                        init="({} ? {} : {})".format(
                            is_last,
                            f"{compute_grid_size[0]} - {k}*{local_work}",
                            local_work,
                        ),
                    )
                self.jumpline()

                if self.array_vids:
                    self.comment("Compute global offsets and line pointers")
                    with self._align_() as al:
                        for array, vid in array_vids.items():
                            gids = array_gids[array]
                            strides = array_strides[array]
                            for key, gid in gids.items():
                                stride = strides[key]
                                # Dot product of index and strides, written
                                # from the last axis down to the first.
                                idot = " $+ ".join(
                                    f"{vid[d]}*{stride[d]}"
                                    for d in range(array_dim - 1, -1, -1)
                                )
                                gid.declare(al, init=idot, align=True)
                    self.jumpline()
                    self.decl_aligned_vars(
                        *(
                            aij
                            for ai in self.array_line_data.values()
                            for aij in ai.values()
                        )
                    )
            yield ctx

    return [work_iterate(i) for i in range(kdim - 1, -1, -1)]
def gencode(self):
    """Generate the whole kernel source.

    Resolves ``self.field_mesh_infos`` from argument names to the actual
    kernel arguments, builds the common and per-array variables, then emits
    inside the kernel body: the declarations, the nested iteration loops and,
    in the innermost loop, the load / compute / store sequence.

    Note that ``self.field_mesh_infos`` is replaced in place by the resolved
    mapping.
    """
    s = self
    # NOTE(review): the right-hand side of this assignment was lost in the
    # extracted listing; `s.csc` is the presumed original -- confirm.
    csc = s.csc
    tg = s.typegen

    global_id = s.vars["global_id"]
    local_id = s.vars["local_id"]
    group_id = s.vars["group_id"]

    global_size = s.vars["global_size"]
    local_size = s.vars["local_size"]
    num_groups = s.vars["num_groups"]

    # Map each field to its mesh info kernel argument (was: argument name).
    s.field_mesh_infos = {
        field: s.args[argname] for (field, argname) in s.field_mesh_infos.items()
    }

    (
        compute_grid_size,
        loop_id,
        vectorization_var,
        max_extra_vwork_var,
        local_work,
        vzero,
        voffset,
        azero,
    ) = s._generate_common_variables()

    (
        array_gids,
        array_vids,
        array_values,
        array_grid_sizes,
        array_grid_ghosts,
        array_local_data,
        array_local_rdata,
        array_private_data,
    ) = s._generate_array_variables()

    event = CodegenVariable("evt", "event_t", tg, init="0")

    with s._kernel_():
        s.jumpline()

        s.comment("Common kernel indices and sizes")
        s.decl_aligned_vars(global_id, local_id, group_id, const=True)
        s.decl_aligned_vars(global_size, local_size, num_groups, const=True)

        s.comment("Common variables")
        s.decl_aligned_vars(
            compute_grid_size,
            azero,
            vzero,
            voffset,
            vectorization_var,
            max_extra_vwork_var,
            local_work,
        )

        s.comment("Array specific variables")
        s.decl_aligned_vars(*tuple(array_grid_sizes.values()), const=True)
        s.decl_aligned_vars(*tuple(array_grid_ghosts.values()), const=True)

        s.comment("Global memory arrays")
        s.decl_aligned_vars(
            *(aij for ai in s.array_args.values() for aij in ai.values())
        )
        s.decl_aligned_vars(*csc.buffer_args.values())

        s.comment("Local and private memory arrays")
        storage_groups = (
            tuple(array_local_data.values())
            + tuple(array_local_rdata.values())
            + tuple(array_private_data.values())
        )
        # Falsy entries (e.g. a missing right cache stored as None) are skipped.
        s.decl_aligned_vars(
            *(
                aij
                for ai in storage_groups
                for aij in filter(lambda x: x, ai.values())
            )
        )

        s.comment("Iterating over array lines")
        nested_loops = s._generate_loop_context()
        with nested(*nested_loops):
            s.load_data(event, local_id)
            s.jumpline()
            s.compute()
            s.jumpline()
            s.store_data(event, local_id)
# s.edit() # s.test_compile() # import sys # sys.exit(1)
def compute(self):
    """Emit the block that evaluates the symbolic expressions.

    Before generating the code, every registered function call gets its
    ``offset`` keyword set and, when it already has a ``dx`` keyword, that
    keyword is replaced by the first component of ``dx`` taken from the first
    registered field mesh info.
    """
    s = self
    s.comment("Compute expressions")

    # NOTE(review): the value assigned to "offset" was lost in the extracted
    # listing (leaving a syntax error); `self.csc.local_offset` is the
    # presumed original -- confirm against the repository.
    offset = self.csc.local_offset

    dx = None
    for fcall in self.fcalls:
        fcall.fn_kwds["offset"] = offset
        if "dx" in fcall.fn_kwds:
            if dx is None:
                # Looked up lazily so kernels without any mesh info and
                # without dx-dependent calls never touch field_mesh_infos.
                mesh_info_0 = next(iter(self.field_mesh_infos.values()))
                dx = mesh_info_0["dx"][0]
            fcall.fn_kwds["dx"] = dx

    with s._block_():
        s.generate_expr_code()
def load_data(self, event, local_id):
    """Emit the code loading array data from global memory.

    Components cached in local memory are filled with
    ``async_work_group_copy``; on every iteration but the first, the
    previously loaded right part of the cache is first moved to the left.
    Components held in private vectors are loaded with ``multi_vload_if``,
    out-of-bounds lanes receiving a default value. Read-write components
    additionally get their right part saved into the ``_r`` cache.

    Nothing is emitted when neither local nor private loads are required.

    Parameters: ``event`` is the codegen event variable used for the
    asynchronous copies (declared here inside the emitted block);
    ``local_id`` is the local id vector, of which only component 0 is used.
    """
    s = self
    # NOTE(review): the right-hand side of this assignment was lost in the
    # extracted listing; `self.csc` is the presumed original -- confirm.
    csc = self.csc
    di = csc.expr_info.discretization_info

    has_local_loads = self.has_local_loads
    has_private_loads = self.has_private_loads
    if not (has_local_loads or has_private_loads):
        return

    lid = local_id[0]

    def guarded_assign(cond, lhs, rhs):
        # Same statement, written with or without short-circuit evaluation.
        if csc.use_short_circuit:
            return f"({cond}) $&& ({lhs} $= {rhs}, true);"
        return f"if( {cond} ) ${{ {lhs} $= {rhs}; }}"

    def fcond(i):
        # Lane i is in bounds when 0 <= full_offset + i < compute_grid_size[0].
        return "({fo}+{i} >= 0) && ({fo}+{i} < {})".format(
            csc.compute_grid_size[0], fo=csc.full_offset, i=i
        )

    # One (local data, right cache or None, global line, ghosts) entry per
    # locally cached component that is actually read.
    local_entries = []
    has_local_right_cache = False

    s.comment("Loading data from global memory.")
    with s._block_():
        if has_local_loads:
            for array in s.array_local_data:
                local_data = s.array_local_data[array]
                local_rdata = s.array_local_rdata[array]
                global_data = s.array_line_data[array]
                min_ghosts = csc.array_contiguous_ghosts[array]
                read_counts = self.fmt_counter(di.read_counter.get(array, None))
                if read_counts is None:
                    continue
                for i in local_data:
                    if read_counts[i] == 0:
                        continue
                    local_entries.append(
                        (local_data[i], local_rdata[i], global_data[i], min_ghosts[i])
                    )
                    has_local_right_cache |= local_rdata[i] is not None

            event.declare(s)

            s.comment("Copy previously loaded data from right to left.")
            with s._if_(f"!{csc.is_first}"):
                with s._align_() as al:
                    for ldata, lrdata, gdata, ghosts in local_entries:
                        # Prefer the dedicated right cache when there is one.
                        if lrdata:
                            rhs = lrdata[lid]
                        else:
                            rhs = ldata[f"{s.local_work}+{lid}"]
                        al.append(
                            guarded_assign(f"{lid}<2*{ghosts}", ldata[lid], rhs)
                        )
                s.barrier(_local=True)

            s.comment("Load right local memory from global memory.")
            with s._if_(csc.is_first):
                # First chunk: load the left ghosts as well.
                with s._align_() as al:
                    for ldata, lrdata, gdata, ghosts in local_entries:
                        code = s.async_work_group_copy(
                            ldata,
                            f"{gdata}$-{ghosts}",
                            f"{s.current_local_work}$+2*{ghosts}",
                            event,
                            align=True,
                        )
                        al.append(code)
            with s._else_():
                # Next chunks: the first 2*ghosts entries were moved above.
                with s._align_() as al:
                    for ldata, lrdata, gdata, ghosts in local_entries:
                        code = s.async_work_group_copy(
                            f"{ldata}$+2*{ghosts}",
                            f"{gdata}$+{ghosts}",
                            f"{s.current_local_work}",
                            event,
                            align=True,
                        )
                        al.append(code)

        if has_private_loads:
            s.comment("Load private data from global memory")
            ptrs, dsts, default_vals = [], [], []
            for array in s.array_private_data:
                private_data = s.array_private_data[array]
                global_data = s.array_line_data[array]
                read_counts = self.fmt_counter(di.read_counter.get(array, None))
                if read_counts is None:
                    continue
                for i in private_data:
                    if read_counts[i] == 0:
                        continue
                    dval = CustomSymbolicFunction.default_out_of_bounds_value(
                        ctype_to_dtype(global_data[i].ctype)
                    )
                    dsts.append(private_data[i])
                    ptrs.append(global_data[i])
                    default_vals.append(dval)
            s.multi_vload_if(
                csc.is_active_boundary,
                fcond,
                csc.vectorization,
                csc.local_offset,
                tuple(ptrs),
                tuple(dsts),
                tuple(default_vals),
                use_short_circuit=csc.use_short_circuit,
                else_cond=csc.is_active,
            )

        if has_local_loads:
            s.comment("Wait for local memory transactions to finish")
            code = s.wait_group_events(1, f"&{event}")
            s.append(code)

        if has_local_right_cache:
            s.comment("Copy right loaded local data for read-write arrays.")
            with s._if_(f"!{csc.is_last}"):
                with s._align_() as al:
                    for ldata, lrdata, gdata, ghosts in local_entries:
                        if lrdata is None:
                            continue
                        al.append(
                            guarded_assign(
                                f"{lid}<2*{ghosts}",
                                lrdata[lid],
                                ldata[f"{s.local_work}+{lid}"],
                            )
                        )

        if has_local_loads or has_local_right_cache:
            s.barrier(_local=True)
def store_data(self, event, local_id):
    """Emit the code writing computed data back to global memory.

    Written components cached in local memory are copied back with
    ``async_work_group_copy`` (skipping the left ghosts), followed by a wait
    on ``event`` and a local barrier. Written components held in private
    vectors are stored with ``multi_vstore_if``.

    Nothing is emitted when neither local nor private stores are required.

    Parameters: ``event`` is the codegen event variable used for the
    asynchronous copies (declared here inside the emitted block);
    ``local_id`` is accepted for symmetry with ``load_data`` and is not used.
    """
    s = self
    # NOTE(review): the right-hand side of this assignment was lost in the
    # extracted listing; `self.csc` is the presumed original -- confirm.
    csc = self.csc
    di = csc.expr_info.discretization_info

    has_local_stores = self.has_local_stores
    has_private_stores = self.has_private_stores
    if not (has_local_stores or has_private_stores):
        return

    def fcond(i):
        # Lane i is in bounds when 0 <= full_offset + i < compute_grid_size[0].
        return "({fo}+{i} >= 0) && ({fo}+{i} < {})".format(
            csc.compute_grid_size[0], fo=csc.full_offset, i=i
        )

    s.comment("Loading data back to global memory.")
    with s._block_():
        if has_local_stores:
            # One (local data, global line, ghosts) entry per locally cached
            # component that is actually written.
            local_entries = []
            for array in s.array_local_data:
                local_data = s.array_local_data[array]
                global_data = s.array_line_data[array]
                min_ghosts = csc.array_contiguous_ghosts[array]
                write_counts = self.fmt_counter(di.write_counter.get(array, None))
                if write_counts is None:
                    continue
                for i in local_data:
                    if write_counts[i] == 0:
                        continue
                    local_entries.append(
                        (local_data[i], global_data[i], min_ghosts[i])
                    )

            event.declare(s)
            s.comment("Load local memory to global memory.")
            with s._align_() as al:
                for ldata, gdata, ghosts in local_entries:
                    code = s.async_work_group_copy(
                        f"{gdata}",
                        f"{ldata}+{ghosts}",
                        f"{s.current_local_work}",
                        event,
                        align=True,
                    )
                    al.append(code)

        if has_private_stores:
            s.comment("Load private data to global memory")
            ptrs, srcs = [], []
            for array in s.array_private_data:
                private_data = s.array_private_data[array]
                global_data = s.array_line_data[array]
                write_counts = self.fmt_counter(di.write_counter.get(array, None))
                if write_counts is None:
                    continue
                for i in private_data:
                    if write_counts[i] == 0:
                        continue
                    srcs.append(private_data[i])
                    ptrs.append(global_data[i])
            s.multi_vstore_if(
                csc.is_active_boundary,
                fcond,
                csc.vectorization,
                csc.local_offset,
                tuple(srcs),
                tuple(ptrs),
                use_short_circuit=csc.use_short_circuit,
                else_cond=csc.is_active,
            )

        if has_local_stores:
            s.comment("Wait for local memory transactions to finish")
            code = s.wait_group_events(1, f"&{event}")
            s.append(code)
            s.barrier(_local=True)
def fmt_counter(self, count):
    """Normalize a read/write counter so that it can be indexed by component.

    A scalar integer counter is wrapped as ``{0: count}``; anything else
    (``None``, a mapping, an array of per-component counts) is returned
    unchanged.

    Any ``numbers.Integral`` is treated as a scalar, so numpy integer
    scalars are wrapped as well instead of being returned as-is (they cannot
    be indexed with ``counts[i]`` by the callers).
    """
    import numbers

    if isinstance(count, numbers.Integral):
        return {0: count}
    return count